PCA

pca <- prcomp(emb)
plot(pca,type="l",col="red")

pca.sum <- summary(pca)
plot(pca.sum$importance["Cumulative Proportion",],type="l")
abline(v=50,col="red")
abline(h=0.5,col="blue")

pc <- as.data.frame(pca$x)
pcn <- normalize(pc)

Diferencias entre espacio original (emb), pca (pc) y pca normalizado (pcn)

w1 <- "rey"
w2 <- "reina"
w3 <- "hombre"

similares(w1,x=emb)
##                   word freq   cos.sim
## 6051             reyes    1 0.7648581
## 3340             reino    1 0.7598407
## 3829   pr<U+00ED>ncipe    1 0.7326977
## 3279             reina    1 0.7253803
## 424843      Harthacnut    1 0.7045774
## 505451         Ragnald    1 0.7024436
## 445990     Sverkersson    1 0.7003423
## 15957          regente    1 0.6984726
## 264595       Hardeknut    1 0.6976819
## 305960  Ladul<U+00E1>s    1 0.6969224
similares(w1,x=pc)
##                   word freq   cos.sim
## 6051             reyes    1 0.7326543
## 3340             reino    1 0.7277504
## 3829   pr<U+00ED>ncipe    1 0.6925480
## 3279             reina    1 0.6853842
## 5012             trono    1 0.6601736
## 15957          regente    1 0.6569846
## 30259          vasallo    1 0.6322269
## 264595       Hardeknut    1 0.6300042
## 28527     pretendiente    1 0.6288754
## 21283         consorte    1 0.6282198
similares(w1,x=pcn)
##                   word freq   cos.sim
## 6051             reyes    1 0.7326543
## 3340             reino    1 0.7277504
## 3829   pr<U+00ED>ncipe    1 0.6925480
## 3279             reina    1 0.6853842
## 5012             trono    1 0.6601736
## 15957          regente    1 0.6569846
## 30259          vasallo    1 0.6322269
## 264595       Hardeknut    1 0.6300042
## 28527     pretendiente    1 0.6288754
## 21283         consorte    1 0.6282198
analogia(w1,w2,w3,x = emb)
##                   word freq   cos.sim
## 788              joven    1 0.6321573
## 63796    f<U+00E9>mina    1 0.6205990
## 40325        jovencita    1 0.6140978
## 14136         muchacha    1 0.5765563
## 432            persona    1 0.5680321
## 3064       ni<U+00F1>a    1 0.5657318
## 606320       trasvesti    1 0.5631110
## 301462 travest<U+00ED>    1 0.5522197
## 18179          anciana    1 0.5488744
## 500250       Acuchilla    1 0.5468871
analogia(w1,w2,w3,x = pc)
##                 word freq   cos.sim
## 63796  f<U+00E9>mina    1 0.5744859
## 788            joven    1 0.5691486
## 40325      jovencita    1 0.5578101
## 14136       muchacha    1 0.5080257
## 432          persona    1 0.5059552
## 3064     ni<U+00F1>a    1 0.5049733
## 18179        anciana    1 0.4807809
## 6425     adolescente    1 0.4715252
## 198865   sexagenaria    1 0.4687589
## 5450           chica    1 0.4678362
analogia(w1,w2,w3,x = pcn)
##                 word freq   cos.sim
## 63796  f<U+00E9>mina    1 0.5749063
## 788            joven    1 0.5728569
## 40325      jovencita    1 0.5576341
## 432          persona    1 0.5090779
## 14136       muchacha    1 0.5080447
## 3064     ni<U+00F1>a    1 0.5045333
## 18179        anciana    1 0.4819011
## 6425     adolescente    1 0.4735706
## 198865   sexagenaria    1 0.4695254
## 5450           chica    1 0.4678327

PC1

hist(pcn$PC1)

vpc1 <- vocab[ order(pcn$PC1,decreasing = T), ]
head(vpc1,n=10)
##                            word freq
## 800453 <U+AD6D><U+C81C><U+C120>    1
## 819987  <U+00C1>ltal<U+00E1>nos    1
## 837692                Inspektor    1
## 913756             Ogrodniczego    1
## 803819                     Seks    1
## 629651             Gospodarstwo    1
## 829811             Lundsgaarder    1
## 842692 <U+B77C><U+C6B4><U+C9C0>    1
## 994759                   LEGACY    1
## 922682                    actie    1
tail(vpc1,n=10)
##                        word freq
## 536001   venci<U+00E9>ndose    1
## 377783           estrecheza    1
## 440762             agazapan    1
## 572525            Wampagkit    1
## 324697     ocultar<U+00E1>n    1
## 481823       desembarazados    1
## 340768 import<U+00E1>ndoles    1
## 606030    cre<U+00E1>ndoles    1
## 668453           Bucellarii    1
## 428031            Esparavel    1
word.plot.dimension(pcn,target.dim = 1,sec.dim = 2,v = vocab,max.plot = 25)

last <- pcn[,ncol(pcn)]
hist(last)

vpc1 <- vocab[ order(last,decreasing = T), ]
head(vpc1,n=10)
##                word freq
## 31               ha    1
## 79     hab<U+00ED>a    1
## 379   hab<U+00ED>an    1
## 41              han    1
## 275           haber    1
## 1406  habr<U+00ED>a    1
## 1002          hayan    1
## 1291             Ha    1
## 5208 habr<U+00ED>an    1
## 5002        hubiese    1
tail(vpc1,n=10)
##              word freq
## 74483    devenido    1
## 11537      optado    1
## 4086     cambiado    1
## 4084       vuelto    1
## 167        estado    1
## 84           sido    1
## 2514     decidido    1
## 55737   perdurado    1
## 17010 sobrevivido    1
## 1612      llegado    1
word.plot.dimension(pcn,target.dim = ncol(pcn),sec.dim = 1,v = vocab,max.plot = 25)

Analisis de un grupo

Interesante: se encuentran numeros en los valores bajos de PC2 (jobwords)

word.plot.dimension(pcn,target.dim = "PC2",sec.dim = "PC1",max.plot = 10)

w <- "rey"
similares(w, x = pcn)
##                   word freq   cos.sim
## 6051             reyes    1 0.7326543
## 3340             reino    1 0.7277504
## 3829   pr<U+00ED>ncipe    1 0.6925480
## 3279             reina    1 0.6853842
## 5012             trono    1 0.6601736
## 15957          regente    1 0.6569846
## 30259          vasallo    1 0.6322269
## 264595       Hardeknut    1 0.6300042
## 28527     pretendiente    1 0.6288754
## 21283         consorte    1 0.6282198
d <- similares(w,n = 100, x = pcn,vectors = T)
dv <- vocab[rownames(d),]
word.plot(d,dims = c(1,2),v=dv)

#pca en ese conjunto
d.pca <- prcomp(d)
summary(d.pca)
## Importance of components:
##                           PC1     PC2     PC3     PC4     PC5     PC6
## Standard deviation     0.2690 0.20987 0.16515 0.14499 0.12936 0.12372
## Proportion of Variance 0.1632 0.09936 0.06153 0.04742 0.03775 0.03453
## Cumulative Proportion  0.1632 0.26258 0.32411 0.37153 0.40928 0.44381
##                            PC7     PC8     PC9    PC10    PC11    PC12
## Standard deviation     0.11562 0.10860 0.10289 0.09674 0.09373 0.08888
## Proportion of Variance 0.03015 0.02661 0.02388 0.02111 0.01982 0.01782
## Cumulative Proportion  0.47396 0.50056 0.52444 0.54555 0.56537 0.58319
##                           PC13    PC14    PC15    PC16    PC17    PC18
## Standard deviation     0.08691 0.08467 0.07807 0.07721 0.07571 0.07475
## Proportion of Variance 0.01704 0.01617 0.01375 0.01345 0.01293 0.01260
## Cumulative Proportion  0.60023 0.61640 0.63015 0.64360 0.65653 0.66914
##                           PC19    PC20    PC21    PC22    PC23    PC24
## Standard deviation     0.07380 0.07147 0.06978 0.06821 0.06696 0.06586
## Proportion of Variance 0.01229 0.01152 0.01099 0.01049 0.01012 0.00979
## Cumulative Proportion  0.68142 0.69295 0.70393 0.71443 0.72454 0.73433
##                           PC25    PC26    PC27    PC28    PC29    PC30
## Standard deviation     0.06451 0.06336 0.06241 0.06027 0.05948 0.05818
## Proportion of Variance 0.00939 0.00906 0.00879 0.00819 0.00798 0.00763
## Cumulative Proportion  0.74371 0.75277 0.76156 0.76975 0.77773 0.78536
##                           PC31    PC32    PC33    PC34    PC35    PC36
## Standard deviation     0.05788 0.05646 0.05600 0.05505 0.05389 0.05324
## Proportion of Variance 0.00756 0.00719 0.00707 0.00684 0.00655 0.00639
## Cumulative Proportion  0.79292 0.80011 0.80719 0.81402 0.82057 0.82697
##                           PC37    PC38    PC39    PC40    PC41    PC42
## Standard deviation     0.05227 0.05175 0.05067 0.04955 0.04903 0.04783
## Proportion of Variance 0.00616 0.00604 0.00579 0.00554 0.00542 0.00516
## Cumulative Proportion  0.83313 0.83917 0.84496 0.85050 0.85592 0.86108
##                           PC43    PC44    PC45    PC46    PC47    PC48
## Standard deviation     0.04718 0.04705 0.04654 0.04563 0.04510 0.04447
## Proportion of Variance 0.00502 0.00499 0.00489 0.00470 0.00459 0.00446
## Cumulative Proportion  0.86610 0.87110 0.87598 0.88068 0.88527 0.88973
##                           PC49    PC50    PC51    PC52    PC53    PC54
## Standard deviation     0.04345 0.04308 0.04243 0.04176 0.04141 0.04057
## Proportion of Variance 0.00426 0.00419 0.00406 0.00393 0.00387 0.00371
## Cumulative Proportion  0.89399 0.89817 0.90223 0.90617 0.91004 0.91375
##                           PC55    PC56    PC57    PC58    PC59    PC60
## Standard deviation     0.03994 0.03934 0.03894 0.03812 0.03758 0.03659
## Proportion of Variance 0.00360 0.00349 0.00342 0.00328 0.00319 0.00302
## Cumulative Proportion  0.91735 0.92084 0.92426 0.92753 0.93072 0.93374
##                           PC61    PC62    PC63    PC64    PC65    PC66
## Standard deviation     0.03583 0.03566 0.03499 0.03476 0.03395 0.03315
## Proportion of Variance 0.00290 0.00287 0.00276 0.00272 0.00260 0.00248
## Cumulative Proportion  0.93664 0.93950 0.94227 0.94499 0.94759 0.95007
##                           PC67    PC68    PC69    PC70    PC71    PC72
## Standard deviation     0.03245 0.03220 0.03176 0.03140 0.03087 0.03063
## Proportion of Variance 0.00238 0.00234 0.00228 0.00222 0.00215 0.00212
## Cumulative Proportion  0.95245 0.95479 0.95706 0.95929 0.96144 0.96355
##                           PC73    PC74    PC75    PC76    PC77    PC78
## Standard deviation     0.03018 0.02943 0.02927 0.02898 0.02870 0.02804
## Proportion of Variance 0.00206 0.00195 0.00193 0.00189 0.00186 0.00177
## Cumulative Proportion  0.96561 0.96756 0.96949 0.97139 0.97324 0.97502
##                           PC79    PC80    PC81    PC82    PC83    PC84
## Standard deviation     0.02791 0.02694 0.02688 0.02634 0.02601 0.02535
## Proportion of Variance 0.00176 0.00164 0.00163 0.00156 0.00153 0.00145
## Cumulative Proportion  0.97678 0.97841 0.98004 0.98161 0.98313 0.98458
##                           PC85    PC86    PC87    PC88    PC89    PC90
## Standard deviation     0.02484 0.02440 0.02388 0.02324 0.02288 0.02248
## Proportion of Variance 0.00139 0.00134 0.00129 0.00122 0.00118 0.00114
## Cumulative Proportion  0.98598 0.98732 0.98860 0.98982 0.99100 0.99214
##                           PC91    PC92    PC93    PC94    PC95    PC96
## Standard deviation     0.02201 0.02128 0.02058 0.02046 0.01994 0.01902
## Proportion of Variance 0.00109 0.00102 0.00096 0.00094 0.00090 0.00082
## Cumulative Proportion  0.99324 0.99426 0.99521 0.99616 0.99705 0.99787
##                           PC97    PC98    PC99     PC100
## Standard deviation     0.01846 0.01756 0.01720 1.195e-16
## Proportion of Variance 0.00077 0.00070 0.00067 0.000e+00
## Cumulative Proportion  0.99864 0.99933 1.00000 1.000e+00
plot(d.pca$sdev,type="o",pch="+")
text(x=seq(1,100),y=d.pca$sdev,labels = paste0("PC",seq(1,100)),cex=0.5,pos=4)

Con dos primeras dimensiones palabras y numeros son separables linealmente

word.plot(d.pca$x,v=dv)

Con PC1 y PC3 (casi solo PC3) las cantidades sin “$” son separables

word.plot(d.pca$x,v=dv,dims = c("PC1","PC3"))

Con PC4 cantidades “pequeñas” son sepabales

word.plot(d.pca$x,v=dv,dims = c("PC1","PC4"))

Con PC5 no hay nada evidente, con PC6 hay algunos errores de limpieza de datos

word.plot(d.pca$x,v=dv,dims = c("PC1","PC6"))

Un k-means muy interesante:

pca.cluster.similar(w,x = pcn,v = vocab,n=200,show.center.analysis = T)

## NULL
dclust<-pca.cluster.similar(w,x = pcn,v = vocab,n = 500,centers = 11)

cluster.wordcloud(dclust,scale = c(3,0.5),by.sim=T)

## Warning in wordcloud(words = dclust.sub$word, freq = f, colors = colors, :
## Ramathibodi could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = dclust.sub$word, freq = f, colors = colors, :
## Tanutamani could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = dclust.sub$word, freq = f, colors = colors, :
## Erishum could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = dclust.sub$word, freq = f, colors = colors, :
## Fil<U+00F3>pator could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = dclust.sub$word, freq = f, colors = colors, :
## Evergetes could not be fit on page. It will not be plotted.

Comparar con el kmeans usando el espacio original:

d <- similares(w,n = 100, x = emb,vectors = T)
dv <- vocab[rownames(d),]
k <- kmeans(d[,1:8],6)
word.plot(d[,1:2],v=dv,col=k$cluster)

Algunas estadisticas interesantes

#head(emb,n = 10)
#similares("france",x=emb)

#Algunas estadisticas interesantes
#summary(emb$V200)
#qqplot(emb$V200,rnorm(1000,mean=mean(emb$V200),sd=sd(emb$V200)),pch=".")
#abline(a=0,b=1,col="red")
#norms<-apply(emb,1,function(x){sqrt(sum(x^2))})
#hist(norms)

#similitud promedio de los 10 primeros resultados para una muestra aleatoria de 100 palabras
#s<-sapply(vocab$V2[sample(nrow(vocab),100)],FUN=function(x){mean(similares(x)$cos.sim)})
#hist(s)